[VectorCombine] Scalarize binop-like intrinsics #138095
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Luke Lau (lukel97)

Changes

Currently VectorCombine can scalarize vector compares and binary ops. This extends it to also scalarize binop-like intrinsics such as umax, minnum, etc.

The motivation behind this is to scalarize more intrinsics in VectorCombine rather than in DAGCombine, so we can sink splats across basic blocks: see #137786.

This currently has very little effect on generated code because InstCombine doesn't yet canonicalize binary intrinsics where one operand is a constant into the form that VectorCombine expects, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)`.

This uses `isTriviallyVectorizable` to determine whether an intrinsic is safe to scalarize. It also only handles intrinsics with two operands. In the future we would generalize this to handle arbitrary numbers of operands, including unary operators too, e.g. fneg or fma.

Full diff: https://github.com/llvm/llvm-project/pull/138095.diff

2 Files Affected:
- llvm/lib/Transforms/Vectorize/VectorCombine.cpp (modified)
- llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll (added)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 04c084ffdda97..7a7c533267f6f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,29 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
return true;
}
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I);
+ II && II->arg_size() == 2 &&
+ isTriviallyVectorizable(II->getIntrinsicID())) {
+ Ins0 = II->getArgOperand(0);
+ Ins1 = II->getArgOperand(1);
+ } else {
+ return false;
+ }
+ }
// Do not convert the vector condition of a vector select into a scalar
// condition. That may cause problems for codegen because of differences in
// boolean formats and register-file transfers.
// TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
+ if (isa<CmpInst>(I))
for (User *U : I.users())
if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
return false;
@@ -1085,15 +1094,24 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
+ if (isa<CmpInst>(I)) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
- } else {
+ } else if (isa<BinaryOperator>(I)) {
ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ IntrinsicCostAttributes ScalarICA(
+ II->getIntrinsicID(), ScalarTy,
+ SmallVector<Type *>(II->arg_size(), ScalarTy));
+ ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+ IntrinsicCostAttributes VectorICA(
+ II->getIntrinsicID(), VecTy,
+ SmallVector<Type *>(II->arg_size(), VecTy));
+ VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
}
// Get cost estimate for the insert element. This cost will factor into
@@ -1112,10 +1130,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
// inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
+ if (isa<CmpInst>(I))
++NumScalarCmp;
- else
+ else if (isa<BinaryOperator>(I))
++NumScalarBO;
+ else if (isa<IntrinsicInst>(I))
+ ++NumScalarIntrinsic;
// For constant cases, extract the scalar element, this should constant fold.
if (IsConst0)
@@ -1123,9 +1143,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (IsConst1)
V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ Value *Scalar;
+ if (isa<CmpInst>(I))
+ Scalar = Builder.CreateCmp(Pred, V0, V1);
+ else if (isa<BinaryOperator>(I))
+ Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1});
+ else
+ llvm_unreachable("Unexpected instruction type");
Scalar->setName(I.getName() + ".scalar");
@@ -1135,9 +1161,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);
// Fold the vector constants in the original vectors into a new base vector.
- Value *NewVecC =
- IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ Value *NewVecC;
+ if (isa<CmpInst>(I))
+ NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+ else if (isa<BinaryOperator>(I))
+ NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
+ else
+ llvm_unreachable("Unexpected instruction type");
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
new file mode 100644
index 0000000000000..5a25f5faf8911
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+ ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+ ret <4 x i32> %v
+}
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM. Please wait for additional approval from other reviewers.
Following from the discussion in #138095 (comment), these intrinsics are poison if any of their operands are poison, and are marked as such in propagatesPoison in ValueTracking.cpp. This will help fold away leftover vectors produced by VectorCombine when scalarizing intrinsics.
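For illustration only, here is a minimal LLVM IR sketch (not taken from either commit; the function name is made up) of the leftover code VectorCombine produces in the tests above, and why poison propagation helps clean it up:

```llvm
; Shape of what scalarizeBinopOrCmp leaves behind when both source vectors
; were inserts into poison (compare the CHECK lines in the tests above).
define <4 x i32> @leftover_after_scalarization(i32 %x, i32 %y) {
  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 %y)
  ; The rebuilt base vector only has poison operands. Once umax is known to
  ; propagate poison, later passes can fold this call away to poison, leaving
  ; just the insertelement of the scalar result.
  %base = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
  %v = insertelement <4 x i32> %base, i32 %v.scalar, i64 0
  ret <4 x i32> %v
}
```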
…ting tests. NFC (#138395) This adds a test that exercises the part of scalarizeBinOpOrCmp that produces immediate UB as described in #138095 (comment), but is fortunately currently folded into a correct transform. I also noticed a bunch of immediate UB in some of the existing tests so this also cleans them up. They should still all be scalarized though.
Gentle ping
@@ -0,0 +1,122 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -p vector-combine | FileCheck %s
do we have any tests where we can get different results based off costs? might need to be moved into a specific target?
Good point, I've added a test that shows a difference in costs between sse/avx in 3f305fd
Should be profitable on AVX2 but not SSE2
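As a rough sketch of how such a cost-dependent test can be structured (hypothetical; this is not the actual content of 3f305fd, and the RUN lines and function name are illustrative): the same IR is run against two X86 subtargets with per-prefix CHECK lines, so the transform can fire where the scalar intrinsic plus insert is cheap enough and be skipped where it isn't.

```llvm
; RUN: opt < %s -S -p vector-combine -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -S -p vector-combine -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

define <4 x float> @maxnum_cost_dependent(float %x, float %y) {
  ; Whether this gets scalarized depends on the target's relative cost of the
  ; scalar vs. vector llvm.maxnum plus the final insertelement.
  %x.insert = insertelement <4 x float> poison, float %x, i32 0
  %y.insert = insertelement <4 x float> poison, float %y, i32 0
  %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x.insert, <4 x float> %y.insert)
  ret <4 x float> %v
}
```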
LGTM
…#141300) We currently pull shuffles through binops and intrinsics, which is an important canonical form for VectorCombine to be able to scalarize vector sequences. But while binops can be folded with a constant operand, intrinsics currently require all operands to be shufflevectors. This extends intrinsic folding to be in line with regular binops by reusing the constant "unshuffling" logic. As far as I can tell, the list of currently folded intrinsics doesn't require any special UB handling.

This change in combination with #138095 and #137823 fixes the following C:

```c
void max(int *x, int *y, int n) {
  for (int i = 0; i < n; i++)
    x[i] += *y > 42 ? *y : 42;
}
```

so that it no longer uses the splatted vector form on RISC-V with `-O3 -march=rva23u64`:

```asm
vmv.s.x     v8, a4
li          a4, 42
vmax.vx     v10, v8, a4
vrgather.vi v8, v10, 0
.LBB0_9: # %vector.body
         # =>This Inner Loop Header: Depth=1
vl2re32.v   v10, (a5)
vadd.vv     v10, v10, v8
vs2r.v      v10, (a5)
```

i.e., it now generates:

```asm
li          a6, 42
max         a6, a4, a6
.LBB0_9: # %vector.body
         # =>This Inner Loop Header: Depth=1
vl2re32.v   v8, (a5)
vadd.vx     v8, v8, a6
vs2r.v      v8, (a5)
```
Currently VectorCombine can scalarize vector compares and binary ops. This extends it to also scalarize binop-like intrinsics such as umax, minnum, etc.

The motivation behind this is to scalarize more intrinsics in VectorCombine rather than in DAGCombine, so we can sink splats across basic blocks: see #137786.

This currently has very little effect on generated code because InstCombine doesn't yet canonicalize binary intrinsics where one operand is a constant into the form that VectorCombine expects, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)`. The plan is to land this first and then in a subsequent patch teach InstCombine to do the canonicalization to avoid regressions in the meantime.

This uses `isTriviallyVectorizable` to determine whether or not an intrinsic is safe to scalarize. There's also `isTriviallyScalarizable`, but that seems more geared towards the Scalarizer pass and includes intrinsics with multiple return values.

It also only handles intrinsics with two operands of the same type as the return type. In the future we would generalize this to handle arbitrary numbers of operands, including unary operators too, e.g. fneg or fma, as well as different operand types, e.g. powi or scmp.
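For illustration, a minimal LLVM IR sketch (not taken from the patch; function names are made up) of the canonical form mentioned above, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)`, applied to a binop-like intrinsic. Once the input is in the second shape, scalarizeBinopOrCmp can replace the vector umax with a scalar one, as in the tests in this PR:

```llvm
; Before canonicalization: the intrinsic operates on the fully splatted value.
define <4 x i32> @splat_then_umax(i32 %x) {
  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %splat, <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
  ret <4 x i32> %r
}

; After canonicalization: the intrinsic is applied to the single inserted lane
; and the splat shuffle is pulled past it; only lane 0 of %op is ever used.
define <4 x i32> @umax_then_splat(i32 %x) {
  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %op = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %ins, <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
  %r = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}
```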